Using the kiln pressure data we can generate features to aid in concreteley determining whether certain patterns produce better yields. Some of the features discussed include:
Area under the curve was originally going to be calculated but after some thought, time spent within the pressure ranges seemed a more useful metric. Could potentially revisit an AUC calculation later on.
press_calcs <- kilns_All %>%
group_by(LOTNO) %>%
mutate(
# adust pressure max and min
pressure = case_when(pressure > .07 ~ .07,
pressure < -.07 ~ -.07,
TRUE ~ pressure
),
# find time index where temp reaches 1200F
close_1200 = if_else( (time < index_max_temp), (abs(1200 - avg_kiln_temp)), NULL ))
# join time index to original
press_calcs <- left_join(press_calcs, press_calcs %>%
group_by(LOTNO) %>%
dplyr::summarise(index_1200F = which.min(close_1200))
)
press_calcs <- press_calcs %>%
group_by(LOTNO) %>%
mutate(
# segregate pressures into positive and negative columns
press_pos = if_else((pressure >= 0) & (time <= index_1200F), pressure, NULL),
press_neg = if_else((pressure < 0) & (time <= index_1200F), pressure, NULL),
# get all pressures below 1200F point for easy mean, sd calcs
press_1200F = if_else(time <= index_1200F, pressure, NULL),
# segregate pressures into range columns
press_greater_03 = if_else((pressure >= .03) & (time <= index_1200F), pressure, NULL),
press_betw_02_03 = if_else((pressure >= .02) & (pressure < .03) & (time <= index_1200F), pressure, NULL),
press_betw_01_02 = if_else((pressure >= .01) & (pressure < .02) & (time <= index_1200F), pressure, NULL),
press_betw_00_01 = if_else((pressure >= .0) & (pressure < .01) & (time <= index_1200F), pressure, NULL),
press_less_00 = if_else((pressure < 0) & (time <= index_1200F), pressure, NULL),
# add counter columns for easy aggregation of total times spent in range
press_pos_count = if_else(!is.na(press_pos),1,0),
press_neg_count = if_else(!is.na(press_neg),1,0),
press_greater_03_count = if_else(!is.na(press_greater_03),1,0),
press_betw_02_03_count = if_else(!is.na(press_betw_02_03),1,0),
press_betw_01_02_count = if_else(!is.na(press_betw_01_02),1,0),
press_betw_00_01_count = if_else(!is.na(press_betw_00_01),1,0),
press_less_00_count = if_else(!is.na(press_less_00),1,0)
) %>%
# remove helper columns
dplyr::select(-c(close_1200)) %>%
dplyr::select(c(LOTNO,index_max_temp,index_1200F,everything()))
# summarise times at pressures
press_summ <- press_calcs %>%
group_by(LOTNO) %>%
dplyr::summarise(
# sum times at negative and positive pressure
time_at_pos = sum(press_pos_count, na.rm=TRUE),
time_at_neg = sum(press_neg_count, na.rm=TRUE),
# get means and sd of pressures
mean_press = mean(press_1200F, na.rm=TRUE),
sd_press = sd(press_1200F, na.rm=TRUE),
# sum times at pressure ranges
time_greater_03 = sum(press_greater_03_count, na.rm=TRUE),
time_betw_02_03 = sum(press_betw_02_03_count, na.rm=TRUE),
time_betw_01_02 = sum(press_betw_01_02_count, na.rm=TRUE),
time_betw_00_01 = sum(press_betw_00_01_count, na.rm=TRUE),
time_less_00 = sum(press_less_00_count, na.rm=TRUE)
)
# join summarised features to original pressure data for easy plotting and verification
press_joined <- left_join(press_calcs, press_summ) %>%
mutate(
pct_time_at_pos = time_at_pos/index_1200F,
pct_time_at_neg = time_at_neg/index_1200F,
pct_time_greater_03 = time_greater_03/index_1200F,
pct_time_betw_02_03 = time_betw_02_03/index_1200F,
pct_time_betw_01_02 = time_betw_01_02/index_1200F,
pct_time_betw_00_01 = time_betw_00_01/index_1200F,
pct_time_less_00 = time_less_00/index_1200F
) %>%
# remove helper columns
dplyr::select(-c(press_pos_count,
press_neg_count,
press_greater_03_count,
press_betw_02_03_count,
press_betw_01_02_count,
press_betw_00_01_count,
press_less_00_count,
press_1200F
))
# attached aucDiff to each df
press_summ <- left_join(press_summ, allAucValues)
press_joined <- left_join(press_joined, allAucValues)
# add kiln column for quick filtering
press_summ <- press_summ %>%
mutate(kiln = as.factor(str_extract(LOTNO, "[:alpha:]")))
# add weather data as well..
press_summ <- df_yields %>%
group_by(LOTNO) %>% slice(1) %>% ungroup() %>%
dplyr::select(LOTNO, precip:temp_avg) %>%
right_join(press_summ)
# prepare lot yield metric
# get lot yields by summarising fired and rejects
lot_yields <-
df_yields %>%
group_by(LOTNO) %>%
dplyr::summarise(
lot_items_fired = sum(TOTAL_ITEM_FIRED),
lot_items_rejected = sum(TOTAL_ITEM_REJECTED),
lot_yield = (lot_items_fired - lot_items_rejected) / lot_items_fired
) %>%
dplyr::select(LOTNO, lot_yield)
# join yields with features, remove 20 missing lot yield values
press_summ <- left_join(press_summ, lot_yields) %>%
na.omit()
#
# significance function
cor.mtest <- function(mat, ...) {
mat <- as.matrix(mat)
n <- ncol(mat)
p.mat<- matrix(NA, n, n)
diag(p.mat) <- 0
for (i in 1:(n - 1)) {
for (j in (i + 1):n) {
tmp <- cor.test(mat[, i], mat[, j], ...)
p.mat[i, j] <- p.mat[j, i] <- tmp$p.value
}
}
colnames(p.mat) <- rownames(p.mat) <- colnames(mat)
p.mat
}
Quick visualization of randomly selected lots to verify that metrics seem to be produced correctly.
# sample random lots
set.seed(123)
sampleC <- sample(levels(kilns_C$LOTNO),1)
sampleD <- sample(levels(kilns_D$LOTNO),1)
sampleE <- sample(levels(kilns_E$LOTNO),1)
sampleF <- sample(levels(kilns_F$LOTNO),1)
sampleG <- sample(levels(kilns_G$LOTNO),2)
sampleH <- sample(levels(kilns_H$LOTNO),2)
sample <- c(sampleC, sampleD, sampleE, sampleF)
# scale and shift values for second y-axis
sec_y_scale=20000
sec_y_shift=1500
# plot
press_joined %>%
dplyr::filter(LOTNO %in% sample) %>%
# KILN TEMP, SETPOINT, PRESSURE
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# PRESSURE POINTS
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
# HLINES
geom_hline(aes(yintercept = .0 * sec_y_scale + sec_y_shift),color='red')+
geom_hline(aes(yintercept = .01 * sec_y_scale + sec_y_shift),color='red',linetype='dashed')+
geom_hline(aes(yintercept = .02 * sec_y_scale + sec_y_shift),color='green',linetype='dashed')+
geom_hline(aes(yintercept = .03 * sec_y_scale + sec_y_shift),color='red',linetype='dashed')+
# mean/sd
# geom_hline(aes(yintercept = mean_press * sec_y_scale + sec_y_shift), color = 'blue',linetype='dotdash',size=1)+
# geom_hline(aes(yintercept = (mean_press+sd_press) * sec_y_scale + sec_y_shift), color = 'blue2',linetype='dotted',size=1)+
# geom_hline(aes(yintercept = (mean_press-sd_press) * sec_y_scale + sec_y_shift), color = 'blue2',linetype='dotted',size=1)+
# PRESSURE RIBBONS
# geom_ribbon(aes(ymin = press_less_00 * sec_y_scale + sec_y_shift, ymax = 0 * sec_y_scale + sec_y_shift),fill='red',alpha=1)+
# geom_ribbon(aes(ymin = 0.00 * sec_y_scale + sec_y_shift, ymax = press_betw_00_01 * sec_y_scale +sec_y_shift),fill='yellow3',alpha=1)+
# geom_ribbon(aes(ymin = 0.0 * sec_y_scale + sec_y_shift, ymax = press_betw_01_03 * sec_y_scale + sec_y_shift),fill='green',alpha=1)+
# geom_ribbon(aes(ymin = 0.0 * sec_y_scale + sec_y_shift, ymax = press_greater_03 * sec_y_scale + sec_y_shift),fill='red',alpha=1)+
# zoom on ribbons
# scale_x_continuous(limits = c(0,18.4*60))+
# scale_y_continuous(limits = c(1250,2350))
# setpoint, temp AUC ribbon
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# labels
# AUC
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(aucDiff),
aes(x = 60 * 30, y = -0.065 * sec_y_scale + sec_y_shift,
label = paste0( "AUC: ", round(aucDiff,0) )
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='pink',label.size=.1
) +
# MEAN
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(mean_press),
aes(x = 60 * 30, y = -0.02 * sec_y_scale + sec_y_shift,
label = paste0( "mean: ", round(mean_press,3) )
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='lightskyblue',label.size=.1
) +
# SD
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(sd_press),
aes(x = 60 * 30, y = -0.03 * sec_y_scale + sec_y_shift,
label = paste0( "sd: ", round(sd_press,3) )
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='lightskyblue2',label.size=.1
) +
# POS PRESS
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_at_pos, pct_time_at_pos, index_1200F),
aes(x = 60 * 72, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( time_at_pos, "m, ",
mypercent(pct_time_at_pos))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='grey80',label.size=.1
) +
# NEG PRESS
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_at_neg, pct_time_at_neg, index_1200F),
aes(x = 60 * 72, y = -0.06 * sec_y_scale + sec_y_shift,
label = paste0( time_at_neg, "m, ",
mypercent(pct_time_at_neg))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='grey90'
)+
# LESS 00
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_less_00, pct_time_less_00, index_1200F),
aes(x = 60 * 72, y = -.02 * sec_y_scale + sec_y_shift,
label = paste0( time_less_00, "m, ",
mypercent(pct_time_less_00))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='red'
)+
# BETW 00 01
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_betw_00_01, pct_time_betw_00_01, index_1200F),
aes(x = 60 * 72, y = 0.0 * sec_y_scale + sec_y_shift,
label = paste0( time_betw_00_01, "m, ",
mypercent(pct_time_betw_00_01))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='yellow3'
) +
# BETW 01 02
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_betw_01_02, pct_time_betw_01_02, index_1200F),
aes(x = 60 * 72, y = .015 * sec_y_scale + sec_y_shift,
label = paste0( time_betw_01_02, "m, ",
mypercent(pct_time_betw_01_02))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='green'
) +
# BETW 02 03
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_betw_02_03, pct_time_betw_02_03, index_1200F),
aes(x = 60 * 72, y = .025 * sec_y_scale + sec_y_shift,
label = paste0( time_betw_02_03, "m, ",
mypercent(pct_time_betw_02_03))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='green3'
) +
# GREATER 03
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_greater_03, pct_time_greater_03, index_1200F),
aes(x = 60 * 72, y = 0.04 * sec_y_scale + sec_y_shift,
label = paste0( time_greater_03, "m, ",
mypercent(pct_time_greater_03))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='red'
) +
facet_wrap(~LOTNO)
# sample random lots
set.seed(8634)
sample <- sample(levels(kilns_All$LOTNO),4)
# plot
press_joined %>%
dplyr::filter(LOTNO %in% sample) %>%
# KILN TEMP, SETPOINT, PRESSURE
ggplot(aes(x=time))+
geom_line(aes(y=avg_kiln_temp))+
geom_line(aes(y=setpoint),color='grey50',linetype='dashed')+
geom_point(aes(y=pressure * sec_y_scale + sec_y_shift),alpha=.1,size=.1)+
# PRESSURE POINTS
geom_point(aes(y=press_betw_02_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green3')+
geom_point(aes(y=press_betw_01_02 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='green')+
geom_point(aes(y=press_greater_03 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
geom_point(aes(y=press_betw_00_01 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='yellow2')+
geom_point(aes(y=press_less_00 * sec_y_scale + sec_y_shift),alpha=1,size=.1,color='red')+
# HLINES
geom_hline(aes(yintercept = .0 * sec_y_scale + sec_y_shift),color='red')+
geom_hline(aes(yintercept = .01 * sec_y_scale + sec_y_shift),color='red',linetype='dashed')+
geom_hline(aes(yintercept = .02 * sec_y_scale + sec_y_shift),color='green',linetype='dashed')+
geom_hline(aes(yintercept = .03 * sec_y_scale + sec_y_shift),color='red',linetype='dashed')+
# mean/sd
# geom_hline(aes(yintercept = mean_press * sec_y_scale + sec_y_shift), color = 'blue',linetype='dotdash',size=1)+
# geom_hline(aes(yintercept = (mean_press+sd_press) * sec_y_scale + sec_y_shift), color = 'blue2',linetype='dotted',size=1)+
# geom_hline(aes(yintercept = (mean_press-sd_press) * sec_y_scale + sec_y_shift), color = 'blue2',linetype='dotted',size=1)+
# PRESSURE RIBBONS
# geom_ribbon(aes(ymin = press_less_00 * sec_y_scale + sec_y_shift, ymax = 0 * sec_y_scale + sec_y_shift),fill='red',alpha=1)+
# geom_ribbon(aes(ymin = 0.00 * sec_y_scale + sec_y_shift, ymax = press_betw_00_01 * sec_y_scale +sec_y_shift),fill='yellow3',alpha=1)+
# geom_ribbon(aes(ymin = 0.0 * sec_y_scale + sec_y_shift, ymax = press_betw_01_03 * sec_y_scale + sec_y_shift),fill='green',alpha=1)+
# geom_ribbon(aes(ymin = 0.0 * sec_y_scale + sec_y_shift, ymax = press_greater_03 * sec_y_scale + sec_y_shift),fill='red',alpha=1)+
# zoom on ribbons
# scale_x_continuous(limits = c(0,18.4*60))+
# scale_y_continuous(limits = c(1250,2350))
# setpoint, temp AUC ribbon
geom_ribbon(aes(ymin = auc_min, ymax = auc_max),fill='pink',alpha=1)+
scale_y_continuous(name = "Kiln temp (F)",
breaks = sort(c(seq(0,3000,1000),400,600,1200)),
sec.axis = sec_axis(~. / sec_y_scale - (sec_y_shift / sec_y_scale),
name = "Pressure",
breaks = seq(-.1,.1,.01)
)
)+
scale_x_continuous(name = "Hours",
labels = number_format(scale=1/60),
breaks = sort(c(seq(0,100*60,12*60)))
)+
# labels
# AUC
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(aucDiff),
aes(x = 60 * 30, y = -0.065 * sec_y_scale + sec_y_shift,
label = paste0( "AUC: ", round(aucDiff,0) )
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='pink',label.size=.1
) +
# MEAN
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(mean_press),
aes(x = 60 * 30, y = -0.02 * sec_y_scale + sec_y_shift,
label = paste0( "mean: ", round(mean_press,3) )
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='lightskyblue',label.size=.1
) +
# SD
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(sd_press),
aes(x = 60 * 30, y = -0.03 * sec_y_scale + sec_y_shift,
label = paste0( "sd: ", round(sd_press,3) )
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='lightskyblue2',label.size=.1
) +
# POS PRESS
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_at_pos, pct_time_at_pos, index_1200F),
aes(x = 60 * 72, y = -0.04 * sec_y_scale + sec_y_shift,
label = paste0( time_at_pos, "m, ",
mypercent(pct_time_at_pos))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='grey80',label.size=.1
) +
# NEG PRESS
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_at_neg, pct_time_at_neg, index_1200F),
aes(x = 60 * 72, y = -0.06 * sec_y_scale + sec_y_shift,
label = paste0( time_at_neg, "m, ",
mypercent(pct_time_at_neg))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='grey90'
)+
# LESS 00
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_less_00, pct_time_less_00, index_1200F),
aes(x = 60 * 72, y = -.02 * sec_y_scale + sec_y_shift,
label = paste0( time_less_00, "m, ",
mypercent(pct_time_less_00))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='red'
)+
# BETW 00 01
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_betw_00_01, pct_time_betw_00_01, index_1200F),
aes(x = 60 * 72, y = 0.0 * sec_y_scale + sec_y_shift,
label = paste0( time_betw_00_01, "m, ",
mypercent(pct_time_betw_00_01))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='yellow3'
) +
# BETW 01 02
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_betw_01_02, pct_time_betw_01_02, index_1200F),
aes(x = 60 * 72, y = .015 * sec_y_scale + sec_y_shift,
label = paste0( time_betw_01_02, "m, ",
mypercent(pct_time_betw_01_02))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='green'
) +
# BETW 02 03
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_betw_02_03, pct_time_betw_02_03, index_1200F),
aes(x = 60 * 72, y = .025 * sec_y_scale + sec_y_shift,
label = paste0( time_betw_02_03, "m, ",
mypercent(pct_time_betw_02_03))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='green3'
) +
# GREATER 03
geom_label(data = press_joined %>% dplyr::filter(LOTNO %in% sample) %>% distinct(time_greater_03, pct_time_greater_03, index_1200F),
aes(x = 60 * 72, y = 0.04 * sec_y_scale + sec_y_shift,
label = paste0( time_greater_03, "m, ",
mypercent(pct_time_greater_03))
),
label.padding = unit(0.2, "lines"),hjust=1,vjust=.5,fill='red'
) +
facet_wrap(~LOTNO)
Distribution check of new features shows generally skewed and heavy-tailed data.
# pivot data and plot histograms
press_summ %>%
dplyr::select(-LOTNO) %>%
dplyr::select(time_at_pos:lot_yield) %>%
pivot_longer(cols = is.numeric) %>%
mutate_if(is.character, factor) %>%
ggplot(aes(x=value))+
geom_freqpoly(bins=15)+
facet_wrap(~name, scales='free')
Mean pressure while temperature is less than 1200F is much higher for kilns G and H.
press_summ %>%
mutate(kiln = as.factor(str_extract(LOTNO, "[:alpha:]"))) %>%
dplyr::select(-LOTNO) %>%
dplyr::select(mean_press,kiln) %>%
ggplot(aes(x=mean_press))+
geom_vline(aes(xintercept=0),linetype='dashed',color='red')+
geom_freqpoly(bins=15)+
facet_wrap(~kiln)
press_summ %>%
mutate(kiln = as.factor(str_extract(LOTNO, "[:alpha:]"))) %>%
dplyr::select(-LOTNO) %>%
dplyr::select(time_less_00,kiln) %>%
ggplot(aes(x=time_less_00))+
geom_vline(aes(xintercept=0),linetype='dashed',color='red')+
geom_freqpoly(bins=15)+
facet_wrap(~kiln)
press_summ %>%
mutate(kiln = as.factor(str_extract(LOTNO, "[:alpha:]"))) %>%
dplyr::select(-LOTNO) %>%
dplyr::select(time_betw_00_01,kiln) %>%
ggplot(aes(x=time_betw_00_01))+
geom_vline(aes(xintercept=0),linetype='dashed',color='red')+
geom_freqpoly(bins=15)+
facet_wrap(~kiln)
Kilns G (especially) and H have far more time between .01 and .02 pressures.
press_summ %>%
mutate(kiln = as.factor(str_extract(LOTNO, "[:alpha:]"))) %>%
dplyr::select(-LOTNO) %>%
dplyr::select(time_betw_01_02,kiln) %>%
ggplot(aes(x=time_betw_01_02))+
geom_vline(aes(xintercept=0),linetype='dashed',color='red')+
geom_freqpoly(bins=15)+
facet_wrap(~kiln)
Kilns G (especially) and H have far more time between .02 and .03 pressures.
press_summ %>%
mutate(kiln = as.factor(str_extract(LOTNO, "[:alpha:]"))) %>%
dplyr::select(-LOTNO) %>%
dplyr::select(time_betw_02_03,kiln) %>%
ggplot(aes(x=time_betw_02_03))+
geom_vline(aes(xintercept=0),linetype='dashed',color='red')+
geom_freqpoly(bins=15)+
facet_wrap(~kiln)
press_summ %>%
mutate(kiln = as.factor(str_extract(LOTNO, "[:alpha:]"))) %>%
dplyr::select(-LOTNO) %>%
dplyr::select(time_greater_03,kiln) %>%
ggplot(aes(x=time_greater_03))+
geom_vline(aes(xintercept=0),linetype='dashed',color='red')+
geom_freqpoly(bins=15)+
facet_wrap(~kiln)
Quick check of linear correlation of lot yields versus all features (including temps).
Only kiln E exhibits significant linear correlations (p > 0.05) with any pressure features.
No significant correlations.
M <- press_summ %>%
dplyr::filter(kiln == "C") %>%
dplyr::select(c(-LOTNO,-kiln,-time_at_pos,-time_at_neg)) %>%
cor()
M <- M[-9,-9]
p.mat <- cor.mtest(M)
# corrplot(as.matrix(M[8,]), method="number",
# p.mat = as.matrix(p.mat[8,])
# , sig.level = 0.05)
corrplot(as.matrix(M[14,]), method="number",
p.mat = as.matrix(p.mat[14,]), sig.level = 0.05)
Many Weak negative correlations, especially with time_betw_01_02, time_betw_02_03, and aucDiff.
M <- press_summ %>%
dplyr::filter(kiln == "D") %>%
dplyr::select(c(-LOTNO,-kiln,-time_at_pos,-time_at_neg)) %>%
cor()
p.mat <- cor.mtest(M)
corrplot(as.matrix(M[15,]), method="number",
p.mat = as.matrix(p.mat[15])
, sig.level = 0.05)
press_summ %>%
dplyr::filter(kiln == "D") %>%
ggplot(aes(x=time_betw_01_02, y=lot_yield,group=time_betw_02_03))+
geom_boxplot()+
geom_boxplot(aes(x=time_betw_01_02))
# geom_jitter(width=.3,alpha=.5,height=0,shape=21)
Strong negative correlations with sd_press, time_greater_03, and time_betw_02_03. As these values increase, lot yields decrease.
M <- press_summ %>%
dplyr::filter(kiln == "E") %>%
dplyr::select(c(-LOTNO,-kiln,-time_at_pos,-time_at_neg)) %>%
cor()
p.mat <- cor.mtest(M)
corrplot(as.matrix(M[15,]), method = "number",
p.mat = as.matrix(p.mat[15,])
, sig.level = 0.05)
g1 <- press_summ %>%
dplyr::filter(kiln == "E") %>%
ggplot(aes(x=sd_press, y=lot_yield))+
geom_point()+
geom_smooth(method='lm')
g2 <- press_summ %>%
dplyr::filter(kiln == "E") %>%
ggplot(aes(x=time_betw_02_03, y=lot_yield, group=time_betw_02_03))+
geom_boxplot()+
geom_jitter(width=.3,alpha=.5,height=0,shape=21)
g3 <- press_summ %>%
dplyr::filter(kiln == "E") %>%
ggplot(aes(x=time_greater_03, y=lot_yield,group=time_greater_03))+
geom_boxplot()+
geom_jitter(width=.3,alpha=.5,height=0,shape=21)
grid.arrange(g1,g2,g3,nrow=1)
No significant correlations.
M <- press_summ %>%
dplyr::filter(kiln == "F") %>%
dplyr::select(c(-LOTNO,-kiln,-time_at_pos,-time_at_neg)) %>%
cor()
p.mat <- cor.mtest(M)
corrplot(as.matrix(M[15,]), method = "number",
p.mat = as.matrix(p.mat[15,])
, sig.level = 0.05)
No significant correlations.
M <- press_summ %>%
dplyr::filter(kiln == "G") %>%
dplyr::select(c(-LOTNO,-kiln,-time_at_pos,-time_at_neg)) %>%
cor()
p.mat <- cor.mtest(M)
corrplot(as.matrix(M[15,]), method = "number",
p.mat = as.matrix(p.mat[15,])
, sig.level = 0.05)
No significant correlations.
M <- press_summ %>%
dplyr::filter(kiln == "H") %>%
dplyr::select(c(-LOTNO,-kiln,-time_at_pos,-time_at_neg)) %>%
cor()
p.mat <- cor.mtest(M)
corrplot(as.matrix(M[15,]), method = "number",
p.mat = as.matrix(p.mat[15,])
, sig.level = 0.05)
\(R^2\) values in general are very poor with the exception of kiln E.
# linear model on each kiln
df <- press_summ %>%
dplyr::select(-LOTNO,-time_at_pos,-time_at_neg, -temp_avg)
lms <- df %>%
nest(data = c(-kiln)) %>%
mutate(fit = map(data, ~lm(lot_yield ~ ., data = .x)),
tidied = map(fit, tidy),
glanced = map(fit, glance),
augmented = map(fit, augment)
)
# glanced
lm_glance <- lms %>%
unnest(glanced) %>%
select(-data,-fit,-tidied,-augmented) %>%
select(kiln, r.squared,AIC,BIC) %>%
arrange(-r.squared) %>%
mutate_if(is.numeric, round, 5)
kable(lm_glance,
format="html") %>%
kable_styling(full_width=T)
| kiln | r.squared | AIC | BIC |
|---|---|---|---|
| E | 0.99846 | -89.84502 | -79.22427 |
| D | 0.23664 | -217.41242 | -180.42380 |
| C | 0.21330 | -128.16283 | -100.57875 |
| G | 0.09174 | -490.19878 | -438.49772 |
| H | 0.08479 | -303.27661 | -255.21651 |
| F | 0.03784 | -177.39467 | -135.96207 |
# tidied
lm_tidy <- lms %>%
unnest(tidied) %>%
select(-data,-fit,-glanced,-augmented) %>%
arrange(kiln, p.value) %>%
mutate_if(is.numeric, round, 5) %>%
mutate(
p.value = ifelse(p.value < .05,
cell_spec(p.value,"html",color="red"),
cell_spec(p.value,"html",color="black"))
)
kable(lm_tidy, format="html",escape = F) %>%
pack_rows( index=c("C" = 14,"D" = 14,"E" = 14,"F" = 14,"G" = 14,"H" = 14)) %>%
kable_styling("striped", full_width=T)
| kiln | term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|---|
| C | |||||
| C | (Intercept) | 0.85120 | 0.09854 | 8.63805 | 0 |
| C | sd_press | 11.56149 | 11.55899 | 1.00022 | 0.32322 |
| C | temp_max | 0.00106 | 0.00111 | 0.95615 | 0.34474 |
| C | aucDiff | 0.00000 | 0.00000 | -0.83611 | 0.40806 |
| C | mean_press | -8.00199 | 9.73070 | -0.82235 | 0.41576 |
| C | precip | -0.03041 | 0.04382 | -0.69409 | 0.49164 |
| C | time_less_00 | -0.00003 | 0.00006 | -0.59942 | 0.55227 |
| C | time_betw_00_01 | 0.00003 | 0.00006 | 0.51597 | 0.60871 |
| C | temp_min | 0.00060 | 0.00128 | 0.46870 | 0.64183 |
| C | snow_fall | -0.00245 | 0.01420 | -0.17259 | 0.86385 |
| C | snow_depth | -0.00078 | 0.00453 | -0.17238 | 0.86401 |
| C | time_betw_01_02 | 0.00026 | 0.00244 | 0.10564 | 0.9164 |
| C | time_betw_02_03 | -0.00005 | 0.00717 | -0.00764 | 0.99394 |
| C | time_greater_03 | NA | NA | NA | NA |
| D | |||||
| D | (Intercept) | 0.91481 | 0.05624 | 16.26736 | 0 |
| D | time_betw_02_03 | -0.05878 | 0.02283 | -2.57484 | 0.01205 |
| D | aucDiff | 0.00000 | 0.00000 | -2.18340 | 0.03222 |
| D | snow_fall | 0.02275 | 0.01551 | 1.46668 | 0.14676 |
| D | time_less_00 | 0.00007 | 0.00006 | 1.35300 | 0.18023 |
| D | time_betw_01_02 | -0.00128 | 0.00117 | -1.08975 | 0.27941 |
| D | snow_depth | -0.00339 | 0.00344 | -0.98540 | 0.32768 |
| D | time_greater_03 | 0.01171 | 0.01829 | 0.64028 | 0.524 |
| D | sd_press | -3.84453 | 7.07149 | -0.54367 | 0.58833 |
| D | time_betw_00_01 | 0.00004 | 0.00009 | 0.48635 | 0.62817 |
| D | mean_press | 4.34886 | 14.00702 | 0.31048 | 0.75708 |
| D | temp_min | -0.00027 | 0.00105 | -0.25494 | 0.79948 |
| D | precip | 0.00619 | 0.02769 | 0.22368 | 0.82363 |
| D | temp_max | 0.00008 | 0.00095 | 0.07914 | 0.93714 |
| E | |||||
| E | time_betw_01_02 | 0.00182 | 0.00018 | 10.32889 | 0.06144 |
| E | mean_press | -108.85279 | 14.60849 | -7.45134 | 0.08493 |
| E | time_betw_00_01 | 0.00078 | 0.00014 | 5.41257 | 0.11631 |
| E | temp_min | 0.00823 | 0.00185 | 4.45509 | 0.14057 |
| E | temp_max | -0.00621 | 0.00145 | -4.27460 | 0.1463 |
| E | (Intercept) | 0.58429 | 0.20431 | 2.85989 | 0.21414 |
| E | snow_depth | 0.08660 | 0.03851 | 2.24871 | 0.26639 |
| E | snow_fall | -0.18387 | 0.09017 | -2.03903 | 0.29027 |
| E | time_less_00 | 0.00026 | 0.00013 | 2.00809 | 0.29414 |
| E | time_greater_03 | -0.06790 | 0.03802 | -1.78606 | 0.32493 |
| E | aucDiff | 0.00000 | 0.00000 | -0.93358 | 0.52186 |
| E | precip | -0.17567 | 0.21740 | -0.80802 | 0.56735 |
| E | sd_press | -18.66761 | 32.50378 | -0.57432 | 0.66811 |
| E | time_betw_02_03 | 0.00242 | 0.02054 | 0.11797 | 0.92525 |
| F | |||||
| F | (Intercept) | 0.88886 | 0.09748 | 9.11789 | 0 |
| F | temp_max | 0.00170 | 0.00129 | 1.32252 | 0.18892 |
| F | temp_min | -0.00172 | 0.00140 | -1.22340 | 0.22397 |
| F | aucDiff | 0.00000 | 0.00000 | 0.75490 | 0.45203 |
| F | time_betw_01_02 | 0.00028 | 0.00046 | 0.61192 | 0.54194 |
| F | sd_press | 2.99793 | 7.30504 | 0.41039 | 0.68237 |
| F | time_greater_03 | 0.00088 | 0.00237 | 0.37248 | 0.7103 |
| F | time_less_00 | -0.00003 | 0.00008 | -0.37204 | 0.71063 |
| F | time_betw_00_01 | -0.00003 | 0.00008 | -0.37046 | 0.7118 |
| F | snow_depth | -0.00137 | 0.00484 | -0.28324 | 0.77756 |
| F | snow_fall | -0.00484 | 0.01971 | -0.24561 | 0.80647 |
| F | precip | -0.00835 | 0.04027 | -0.20746 | 0.83606 |
| F | time_betw_02_03 | -0.00017 | 0.00186 | -0.09120 | 0.92751 |
| F | mean_press | 0.26109 | 11.37128 | 0.02296 | 0.98173 |
| G | |||||
| G | (Intercept) | 0.69377 | 0.07871 | 8.81458 | 0 |
| G | time_greater_03 | -0.00060 | 0.00019 | -3.08980 | 0.00226 |
| G | time_betw_00_01 | 0.00013 | 0.00005 | 2.65998 | 0.0084 |
| G | mean_press | 14.07606 | 5.94712 | 2.36687 | 0.01881 |
| G | time_less_00 | 0.00016 | 0.00009 | 1.90628 | 0.05793 |
| G | sd_press | 6.07222 | 3.25883 | 1.86331 | 0.06376 |
| G | aucDiff | 0.00001 | 0.00000 | 1.24822 | 0.21329 |
| G | time_betw_01_02 | -0.00006 | 0.00006 | -0.95087 | 0.34273 |
| G | snow_fall | -0.00502 | 0.00590 | -0.85106 | 0.39567 |
| G | temp_max | -0.00051 | 0.00066 | -0.77430 | 0.43959 |
| G | time_betw_02_03 | -0.00003 | 0.00010 | -0.31542 | 0.75275 |
| G | snow_depth | 0.00081 | 0.00303 | 0.26627 | 0.79028 |
| G | temp_min | -0.00003 | 0.00075 | -0.04422 | 0.96477 |
| G | precip | -0.00067 | 0.02223 | -0.03026 | 0.97589 |
| H | |||||
| H | (Intercept) | 0.88829 | 0.14374 | 6.17995 | 0 |
| H | aucDiff | 0.00000 | 0.00000 | -2.91740 | 0.00401 |
| H | time_betw_01_02 | 0.00022 | 0.00014 | 1.63055 | 0.10486 |
| H | temp_max | -0.00111 | 0.00102 | -1.09009 | 0.27723 |
| H | time_betw_00_01 | 0.00011 | 0.00012 | 0.93691 | 0.35015 |
| H | mean_press | -7.49653 | 8.88233 | -0.84398 | 0.39988 |
| H | time_betw_02_03 | 0.00026 | 0.00031 | 0.83270 | 0.4062 |
| H | precip | 0.01484 | 0.02994 | 0.49574 | 0.62073 |
| H | time_less_00 | 0.00006 | 0.00016 | 0.36109 | 0.71848 |
| H | snow_fall | 0.00339 | 0.00948 | 0.35727 | 0.72134 |
| H | sd_press | -2.37194 | 7.45432 | -0.31820 | 0.75073 |
| H | snow_depth | -0.00109 | 0.00398 | -0.27476 | 0.78384 |
| H | time_greater_03 | 0.00011 | 0.00065 | 0.16109 | 0.87222 |
| H | temp_min | 0.00015 | 0.00114 | 0.12964 | 0.89701 |
# augmented
lms %>% unnest(augmented) %>%
select(-data,-fit,-tidied,-glanced) %>%
ggplot(aes(x=lot_yield, y=.fitted))+
geom_pointdensity(show.legend = FALSE)+
# geom_point()+
geom_abline(lty=2,color='red')+
facet_wrap(~kiln)+
scale_y_continuous(limits=c(0.6,1))+
scale_x_continuous(limits=c(0.6,1))+
scale_color_viridis_c()
To get an idea of which variables are most important we can use LASSO regression to remove the least important variables. We can also test with a random forest variable importance implementation for comparison sake.
Recall the most significant variables from the earlier linear model based on p-value. Low p-value generally indicates variables are not simply due to random effects. The rather low \(R^2\) indicates that only about 10% of the variability can be explained by the model.
kable(lm_tidy %>% dplyr::filter(kiln == "G"),
format="html",
caption = paste0("Significant variables by p-value, Model R^2: ",
round(lm_glance$r.squared[[4]], 3)),
escape = F) %>%
pack_rows( index=c("Kiln G" = 14)) %>%
kable_styling("striped", full_width=T)
| kiln | term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|---|
| Kiln G | |||||
| G | (Intercept) | 0.69377 | 0.07871 | 8.81458 | 0 |
| G | time_greater_03 | -0.00060 | 0.00019 | -3.08980 | 0.00226 |
| G | time_betw_00_01 | 0.00013 | 0.00005 | 2.65998 | 0.0084 |
| G | mean_press | 14.07606 | 5.94712 | 2.36687 | 0.01881 |
| G | time_less_00 | 0.00016 | 0.00009 | 1.90628 | 0.05793 |
| G | sd_press | 6.07222 | 3.25883 | 1.86331 | 0.06376 |
| G | aucDiff | 0.00001 | 0.00000 | 1.24822 | 0.21329 |
| G | time_betw_01_02 | -0.00006 | 0.00006 | -0.95087 | 0.34273 |
| G | snow_fall | -0.00502 | 0.00590 | -0.85106 | 0.39567 |
| G | temp_max | -0.00051 | 0.00066 | -0.77430 | 0.43959 |
| G | time_betw_02_03 | -0.00003 | 0.00010 | -0.31542 | 0.75275 |
| G | snow_depth | 0.00081 | 0.00303 | 0.26627 | 0.79028 |
| G | temp_min | -0.00003 | 0.00075 | -0.04422 | 0.96477 |
| G | precip | -0.00067 | 0.02223 | -0.03026 | 0.97589 |
LASSO regression requires scaled and centered variables and works by removing variables with high correlation or little importance by adjusting their coefficients to zero. This is still a type of linear regression but generally more reliable than relying on p-value.
# original data
df <- press_summ %>%
dplyr::filter(kiln == "G") %>%
dplyr::select(mean_press:lot_yield, -kiln)
# dplyr::select(precip:lot_yield, -kiln)
# splits
set.seed(323)
pressure_split <- initial_split(df)
pressure_train <- training(pressure_split)
pressure_test <- testing(pressure_split)
# lasso recipe
pressure_rec <- recipe(lot_yield ~ ., data = pressure_train) %>%
step_normalize(all_numeric(), -all_outcomes()) %>%
prep()
# lasso workflow
wf <- workflow() %>%
add_recipe(pressure_rec)
# lasso model specification
lasso_spec <- linear_reg(penalty = tune(),
mixture = 1) %>%
set_engine("glmnet")
# generate folds for tuning
set.seed(1434)
lasso_folds <- vfold_cv(pressure_train)
# generate a grid for tuning
lambda_grid <- grid_regular(penalty(), levels = 50)
# tune model
set.seed(211)
lasso_grid <- tune_grid(
wf %>% add_model(lasso_spec),
resamples = lasso_folds,
grid = lambda_grid
)
# visualize tuning results
# lasso_grid %>% collect_metrics() %>%
# ggplot(aes(penalty, mean, color=.metric))+
# geom_errorbar(aes(ymin = mean - std_err,
# ymax = mean + std_err), alpha = .4)+
# geom_point(size=2)+
# geom_line(size=1.5)+
# scale_x_log10()+
# facet_wrap(~.metric,nrow=2,scales="free")
# get best rsq value
best_rsq <- lasso_grid %>% select_best(metric = "rsq")
# best_rmse <- lasso_grid %>% select_best(metric = "rmse")
# finalize model wf
final_lasso_wf <- finalize_workflow(
wf %>% add_model(lasso_spec),
best_rsq
# best_rmse
)
# finalize model
final_lasso_fit <- last_fit(final_lasso_wf, pressure_split)
# save r2
r2 <- final_lasso_fit %>% collect_metrics() %>% select(.estimate) %>% slice(2)
# visualize variable importance on entire data
gg_lasso_importance <- final_lasso_wf %>%
fit(df) %>%
# fit(pressure_train) %>%
pull_workflow_fit() %>%
vi(lambda = best_rsq$penalty) %>%
mutate(Importance = abs(Importance),
Variable = fct_reorder(Variable, Importance)) %>%
ggplot(aes(x=Importance,y=Variable,fill=Sign))+
geom_col()+
labs(y=NULL)+
scale_x_continuous(expand = c(0,0))+
ggtitle(paste0("Variable importance using LASSO"))+
labs(subtitle = paste0("Model R^2: ", round(r2,4)))
gg_lasso_importance
# split train/test
set.seed(3332)
pressure_split <- initial_split(df)
pressure_train <- training(pressure_split)
pressure_test <- testing(pressure_split)
# rf recipe
pressure_rec <- recipe(lot_yield ~ ., data = pressure_train) %>%
prep()
# rf model specification
rf_spec <- rand_forest(mode="regression",
mtry = tune(),
trees = 1000,
min_n = tune()) %>%
set_engine("randomForest")
# workflow
wf <- workflow() %>%
add_recipe(pressure_rec)
# generate folds for tuning
set.seed(131)
rf_folds <- vfold_cv(pressure_train)
# tune on non-specific grid
tune_res <- tune_grid(
wf %>% add_model(rf_spec),
resamples = rf_folds,
grid = 20
)
# visualize first tuning results
# tune_res %>% collect_metrics() %>%
# filter(.metric == "rsq") %>%
# select(mean, min_n, mtry) %>%
# pivot_longer(min_n:mtry, values_to ="value", names_to = "parameter") %>%
# ggplot(aes(value,mean,color=parameter))+
# geom_point()+
# geom_line()+
# facet_wrap(~parameter, nrow=2, scales="free")
# generate more specific grid
rf_grid <- grid_regular(
mtry(range = c(1,8)),
min_n(range = c(20,37)),
levels = 6
)
# tune on more specific grid
regular_res <- tune_grid(
wf %>% add_model(rf_spec),
resamples = rf_folds,
grid = rf_grid
)
# visualize second tuning results
regular_res %>% collect_metrics() %>%
filter(.metric == "rsq") %>%
mutate(min_n = as.factor(min_n)) %>%
ggplot(aes(mtry, mean, color = min_n))+
geom_point()+
geom_line(size=1.2)
# store best models
best_rsq <- regular_res %>% select_best(metric="rsq")
best_rmse <- regular_res %>% select_best(metric="rmse")
# finalize model
final_rf <- finalize_model(
rf_spec,
best_rsq
)
# variable importance on training
# final_rf %>%
# set_engine("randomForest") %>%
# fit(lot_yield ~ ., juice(pressure_rec)) %>%
# vip(geom = "point")
# final random forest workflow
final_rf_wf <- workflow() %>%
add_recipe(pressure_rec) %>%
add_model(final_rf)
# fit model on test set
final_rf_res <- final_rf_wf %>%
last_fit(pressure_split)
# metrics for final model
r2 <- final_rf_res %>% collect_metrics() %>% select(.estimate) %>% slice(2)
# plot predictions vs results on test set
# final_rf_res %>% collect_predictions() %>%
# ggplot(aes(.pred, lot_yield))+
# geom_point()+geom_abline(lty=2,color='red')
# final model
# final_rf_model <- fit(final_rf_wf, df)
# variable importance on all data
gg_rf_importance <- final_rf %>%
set_engine("randomForest") %>%
fit(lot_yield ~ ., df) %>%
vip(geom = "point")+
ggtitle(paste0("Variable importance using RandomForest"))+
labs(subtitle = paste0("Model R^2: ", round(r2,4)))
gg_rf_importance
Recall the most significant variables from the earlier linear model based on p-value. Low p-value generally indicates variables are not simply due to random effects. The rather low \(R^2\) indicates that only about 10% of the variability can be explained by the model.
kable(lm_tidy %>% dplyr::filter(kiln == "H"),
format="html",
caption = paste0("Significant variables by p-value, Model R^2: ",
round(lm_glance$r.squared[[4]], 3)),
escape = F) %>%
pack_rows( index=c("Kiln H" = 14)) %>%
kable_styling("striped", full_width=T)
| kiln | term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|---|
| Kiln H | |||||
| H | (Intercept) | 0.88829 | 0.14374 | 6.17995 | 0 |
| H | aucDiff | 0.00000 | 0.00000 | -2.91740 | 0.00401 |
| H | time_betw_01_02 | 0.00022 | 0.00014 | 1.63055 | 0.10486 |
| H | temp_max | -0.00111 | 0.00102 | -1.09009 | 0.27723 |
| H | time_betw_00_01 | 0.00011 | 0.00012 | 0.93691 | 0.35015 |
| H | mean_press | -7.49653 | 8.88233 | -0.84398 | 0.39988 |
| H | time_betw_02_03 | 0.00026 | 0.00031 | 0.83270 | 0.4062 |
| H | precip | 0.01484 | 0.02994 | 0.49574 | 0.62073 |
| H | time_less_00 | 0.00006 | 0.00016 | 0.36109 | 0.71848 |
| H | snow_fall | 0.00339 | 0.00948 | 0.35727 | 0.72134 |
| H | sd_press | -2.37194 | 7.45432 | -0.31820 | 0.75073 |
| H | snow_depth | -0.00109 | 0.00398 | -0.27476 | 0.78384 |
| H | time_greater_03 | 0.00011 | 0.00065 | 0.16109 | 0.87222 |
| H | temp_min | 0.00015 | 0.00114 | 0.12964 | 0.89701 |
LASSO regression requires scaled and centered variables and works by removing variables with high correlation or little importance by adjusting their coefficients to zero. This is still a type of linear regression but generally more reliable than relying on p-value.
# original data
df <- press_summ %>%
dplyr::filter(kiln == "H") %>%
dplyr::select(mean_press:lot_yield, -kiln)
# dplyr::select(precip:lot_yield, -kiln)
# splits
set.seed(32)
pressure_split <- initial_split(df)
pressure_train <- training(pressure_split)
pressure_test <- testing(pressure_split)
# lasso recipe
pressure_rec <- recipe(lot_yield ~ ., data = pressure_train) %>%
step_normalize(all_numeric(), -all_outcomes()) %>%
prep()
# lasso workflow
wf <- workflow() %>%
add_recipe(pressure_rec)
# lasso model specification
lasso_spec <- linear_reg(penalty = tune(),
mixture = 1) %>%
set_engine("glmnet")
# generate folds for tuning
set.seed(144)
lasso_folds <- vfold_cv(pressure_train)
# generate a grid for tuning
lambda_grid <- grid_regular(penalty(), levels = 50)
# tune model
set.seed(211)
lasso_grid <- tune_grid(
wf %>% add_model(lasso_spec),
resamples = lasso_folds,
grid = lambda_grid
)
# visualize tuning results
# lasso_grid %>% collect_metrics() %>%
# ggplot(aes(penalty, mean, color=.metric))+
# geom_errorbar(aes(ymin = mean - std_err,
# ymax = mean + std_err), alpha = .4)+
# geom_point(size=2)+
# geom_line(size=1.5)+
# scale_x_log10()+
# facet_wrap(~.metric,nrow=2,scales="free")
# get best rsq value
best_rsq <- lasso_grid %>% select_best(metric = "rsq")
# best_rmse <- lasso_grid %>% select_best(metric = "rmse")
# finalize model wf
final_lasso_wf <- finalize_workflow(
wf %>% add_model(lasso_spec),
best_rsq
# best_rmse
)
# finalize model
final_lasso_fit <- last_fit(final_lasso_wf, pressure_split)
# save r2
r2 <- final_lasso_fit %>% collect_metrics() %>% select(.estimate) %>% slice(2)
# visualize variable importance on entire data
gg_lasso_importance <- final_lasso_wf %>%
fit(df) %>%
# fit(pressure_train) %>%
pull_workflow_fit() %>%
vi(lambda = best_rsq$penalty) %>%
mutate(Importance = abs(Importance),
Variable = fct_reorder(Variable, Importance)) %>%
ggplot(aes(x=Importance,y=Variable,fill=Sign))+
geom_col()+
labs(y=NULL)+
scale_x_continuous(expand = c(0,0))+
ggtitle(paste0("Variable importance using LASSO"))+
labs(subtitle = paste0("Model R^2: ", round(r2,4)))
gg_lasso_importance
# split train/test
set.seed(3332)
pressure_split <- initial_split(df)
pressure_train <- training(pressure_split)
pressure_test <- testing(pressure_split)
# rf recipe
pressure_rec <- recipe(lot_yield ~ ., data = pressure_train) %>%
prep()
# rf model specification
rf_spec <- rand_forest(mode="regression",
mtry = tune(),
trees = 1000,
min_n = tune()) %>%
set_engine("randomForest")
# workflow
wf <- workflow() %>%
add_recipe(pressure_rec)
# generate folds for tuning
set.seed(131)
rf_folds <- vfold_cv(pressure_train)
# tune on non-specific grid
tune_res <- tune_grid(
wf %>% add_model(rf_spec),
resamples = rf_folds,
grid = 20
)
# visualize first tuning results
tune_res %>% collect_metrics() %>%
filter(.metric == "rsq") %>%
select(mean, min_n, mtry) %>%
pivot_longer(min_n:mtry, values_to ="value", names_to = "parameter") %>%
ggplot(aes(value,mean,color=parameter))+
geom_point()+
geom_line()+
facet_wrap(~parameter, nrow=2, scales="free")
# generate more specific grid
rf_grid <- grid_regular(
mtry(range = c(1,5)),
min_n(range = c(10,30)),
levels = 7
)
# tune on more specific grid
regular_res <- tune_grid(
wf %>% add_model(rf_spec),
resamples = rf_folds,
grid = rf_grid
)
# visualize second tuning results
# regular_res %>% collect_metrics() %>%
# filter(.metric == "rsq") %>%
# mutate(min_n = as.factor(min_n)) %>%
# ggplot(aes(mtry, mean, color = min_n))+
# geom_point()+
# geom_line(size=1.2)
# store best models
best_rsq <- regular_res %>% select_best(metric="rsq")
best_rmse <- regular_res %>% select_best(metric="rmse")
# finalize model
final_rf <- finalize_model(
rf_spec,
best_rsq
)
# variable importance on training
# final_rf %>%
# set_engine("randomForest") %>%
# fit(lot_yield ~ ., juice(pressure_rec)) %>%
# vip(geom = "point")
# final random forest workflow
final_rf_wf <- workflow() %>%
add_recipe(pressure_rec) %>%
add_model(final_rf)
# fit model on test set
final_rf_res <- final_rf_wf %>%
last_fit(pressure_split)
# metrics for final model
r2 <- final_rf_res %>% collect_metrics() %>% select(.estimate) %>% slice(2)
# plot predictions vs results on test set
# final_rf_res %>% collect_predictions() %>%
# ggplot(aes(.pred, lot_yield))+
# geom_point()+geom_abline(lty=2,color='red')
# final model
# final_rf_model <- fit(final_rf_wf, df)
# variable importance on all data
gg_rf_importance <- final_rf %>%
set_engine("randomForest") %>%
fit(lot_yield ~ ., df) %>%
vip(geom = "point")+
ggtitle(paste0("Variable importance using RandomForest"))+
labs(subtitle = paste0("Model R^2: ", round(r2,4)))
gg_rf_importance
Recall the most significant variables from the earlier linear model based on p-value. Low p-value generally indicates variables are not simply due to random effects. The rather low \(R^2\) indicates that only about 10% of the variability can be explained by the model.
kable(lm_tidy %>% dplyr::filter(kiln == "D"),
format="html",
caption = paste0("Significant variables by p-value, Model R^2: ",
round(lm_glance$r.squared[[4]], 3)),
escape = F) %>%
pack_rows( index=c("Kiln D" = 14)) %>%
kable_styling("striped", full_width=T)
| kiln | term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|---|
| Kiln D | |||||
| D | (Intercept) | 0.91481 | 0.05624 | 16.26736 | 0 |
| D | time_betw_02_03 | -0.05878 | 0.02283 | -2.57484 | 0.01205 |
| D | aucDiff | 0.00000 | 0.00000 | -2.18340 | 0.03222 |
| D | snow_fall | 0.02275 | 0.01551 | 1.46668 | 0.14676 |
| D | time_less_00 | 0.00007 | 0.00006 | 1.35300 | 0.18023 |
| D | time_betw_01_02 | -0.00128 | 0.00117 | -1.08975 | 0.27941 |
| D | snow_depth | -0.00339 | 0.00344 | -0.98540 | 0.32768 |
| D | time_greater_03 | 0.01171 | 0.01829 | 0.64028 | 0.524 |
| D | sd_press | -3.84453 | 7.07149 | -0.54367 | 0.58833 |
| D | time_betw_00_01 | 0.00004 | 0.00009 | 0.48635 | 0.62817 |
| D | mean_press | 4.34886 | 14.00702 | 0.31048 | 0.75708 |
| D | temp_min | -0.00027 | 0.00105 | -0.25494 | 0.79948 |
| D | precip | 0.00619 | 0.02769 | 0.22368 | 0.82363 |
| D | temp_max | 0.00008 | 0.00095 | 0.07914 | 0.93714 |
LASSO regression requires scaled and centered variables and works by removing variables with high correlation or little importance by adjusting their coefficients to zero. This is still a type of linear regression but generally more reliable than relying on p-value.
# original data
df <- press_summ %>%
dplyr::filter(kiln == "D") %>%
dplyr::select(mean_press:lot_yield, -kiln)
# dplyr::select(precip:lot_yield, -kiln)
# splits
set.seed(32)
pressure_split <- initial_split(df)
pressure_train <- training(pressure_split)
pressure_test <- testing(pressure_split)
# lasso recipe
pressure_rec <- recipe(lot_yield ~ ., data = pressure_train) %>%
step_normalize(all_numeric(), -all_outcomes()) %>%
prep()
# lasso workflow
wf <- workflow() %>%
add_recipe(pressure_rec)
# lasso model specification
lasso_spec <- linear_reg(penalty = tune(),
mixture = 1) %>%
set_engine("glmnet")
# generate folds for tuning
set.seed(144)
lasso_folds <- vfold_cv(pressure_train)
# generate a grid for tuning
lambda_grid <- grid_regular(penalty(), levels = 50)
# tune model
set.seed(211)
lasso_grid <- tune_grid(
wf %>% add_model(lasso_spec),
resamples = lasso_folds,
grid = lambda_grid
)
# visualize tuning results
# lasso_grid %>% collect_metrics() %>%
# ggplot(aes(penalty, mean, color=.metric))+
# geom_errorbar(aes(ymin = mean - std_err,
# ymax = mean + std_err), alpha = .4)+
# geom_point(size=2)+
# geom_line(size=1.5)+
# scale_x_log10()+
# facet_wrap(~.metric,nrow=2,scales="free")
# get best rsq value
best_rsq <- lasso_grid %>% select_best(metric = "rsq")
# best_rmse <- lasso_grid %>% select_best(metric = "rmse")
# finalize model wf
final_lasso_wf <- finalize_workflow(
wf %>% add_model(lasso_spec),
best_rsq
# best_rmse
)
# finalize model
final_lasso_fit <- last_fit(final_lasso_wf, pressure_split)
# save r2
r2 <- final_lasso_fit %>% collect_metrics() %>% select(.estimate) %>% slice(2)
# visualize variable importance on entire data
gg_lasso_importance <- final_lasso_wf %>%
fit(df) %>%
# fit(pressure_train) %>%
pull_workflow_fit() %>%
vi(lambda = best_rsq$penalty) %>%
mutate(Importance = abs(Importance),
Variable = fct_reorder(Variable, Importance)) %>%
ggplot(aes(x=Importance,y=Variable,fill=Sign))+
geom_col()+
labs(y=NULL)+
scale_x_continuous(expand = c(0,0))+
ggtitle(paste0("Variable importance using LASSO"))+
labs(subtitle = paste0("Model R^2: ", round(r2,4)))
gg_lasso_importance
# split train/test
set.seed(3332)
pressure_split <- initial_split(df)
pressure_train <- training(pressure_split)
pressure_test <- testing(pressure_split)
# rf recipe
pressure_rec <- recipe(lot_yield ~ ., data = pressure_train) %>%
prep()
# rf model specification
rf_spec <- rand_forest(mode="regression",
mtry = tune(),
trees = 1000,
min_n = tune()) %>%
set_engine("randomForest")
# workflow
wf <- workflow() %>%
add_recipe(pressure_rec)
# generate folds for tuning
set.seed(131)
rf_folds <- vfold_cv(pressure_train)
# tune on non-specific grid
tune_res <- tune_grid(
wf %>% add_model(rf_spec),
resamples = rf_folds,
grid = 20
)
# visualize first tuning results
tune_res %>% collect_metrics() %>%
filter(.metric == "rsq") %>%
select(mean, min_n, mtry) %>%
pivot_longer(min_n:mtry, values_to ="value", names_to = "parameter") %>%
ggplot(aes(value,mean,color=parameter))+
geom_point()+
geom_line()+
facet_wrap(~parameter, nrow=2, scales="free")
# generate more specific grid
rf_grid <- grid_regular(
min_n(range = c(1,20)),
mtry(range = c(10,16)),
levels = 6
)
# tune on more specific grid
regular_res <- tune_grid(
wf %>% add_model(rf_spec),
resamples = rf_folds,
grid = rf_grid
)
# visualize second tuning results
regular_res %>% collect_metrics() %>%
filter(.metric == "rsq") %>%
mutate(min_n = as.factor(min_n)) %>%
ggplot(aes(mtry, mean, color = min_n))+
geom_point()+
geom_line(size=1.2)
# store best models
best_rsq <- regular_res %>% select_best(metric="rsq")
best_rmse <- regular_res %>% select_best(metric="rmse")
# finalize model
final_rf <- finalize_model(
rf_spec,
best_rsq
)
# variable importance on training
# final_rf %>%
# set_engine("randomForest") %>%
# fit(lot_yield ~ ., juice(pressure_rec)) %>%
# vip(geom = "point")
# final random forest workflow
final_rf_wf <- workflow() %>%
add_recipe(pressure_rec) %>%
add_model(final_rf)
# fit model on test set
final_rf_res <- final_rf_wf %>%
last_fit(pressure_split)
# metrics for final model
r2 <- final_rf_res %>% collect_metrics() %>% select(.estimate) %>% slice(2)
# plot predictions vs results on test set
# final_rf_res %>% collect_predictions() %>%
# ggplot(aes(.pred, lot_yield))+
# geom_point()+geom_abline(lty=2,color='red')
# final model
# final_rf_model <- fit(final_rf_wf, df)
# variable importance on all data
gg_rf_importance <- final_rf %>%
set_engine("randomForest") %>%
fit(lot_yield ~ ., df) %>%
vip(geom = "point")+
ggtitle(paste0("Variable importance using RandomForest"))+
labs(subtitle = paste0("Model R^2: ", round(r2,4)))
gg_rf_importance
press_summ %>%
dplyr::filter(kiln == "D") %>%
dplyr::select(precip:lot_yield, -kiln) %>%
ggplot(aes(x=temp_max,y=lot_yield))+
geom_point()